package com.ly.crawl;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

/**
 * Information collector: extracts a value of type {@code T} from a parsed page.
 *
 * @param <T> type of the captured result
 */
public interface Capturer<T> {

    /**
     * Looks up the specified resource.
     *
     * @param url address of the web page to search
     * @param doc parsed document to extract from (presumably the page loaded
     *            from {@code url} - confirm against callers)
     * @return the captured result
     */
    T capture(String url, Document doc);

    /**
     * Reads a value from an element according to a field specification.
     *
     * <p>The specification is either a plain name or {@code name(regex)}:
     * <ul>
     *   <li>{@code null} or {@code "text"} - {@code e.text()}</li>
     *   <li>{@code "html"} - {@code e.html()}</li>
     *   <li>{@code "script"} - {@code e.getElementsByTag("script").html()}</li>
     *   <li>anything else - {@code e.attr(name)}</li>
     * </ul>
     * With the {@code name(regex)} form, every match of {@code regex} in the
     * selected value is replaced by its first capture group. The name part may
     * contain word characters, {@code '-'} and {@code ':'}, so specifications
     * such as {@code data-src(...)} or {@code abs:href(...)} are accepted.
     *
     * @param e     element to read from; must not be {@code null}
     * @param field field specification, may be {@code null}
     * @return the selected value, post-processed by the regex when one is given
     * @throws java.util.regex.PatternSyntaxException if the embedded regex is invalid
     * @throws IndexOutOfBoundsException if the embedded regex matches but
     *         declares no capture group (the replacement refers to {@code $1})
     */
    default String getValue(Element e, String field) {
        String name = field;
        String regex = null;
        if (field != null && field.matches("[\\w:-]+\\(.+\\)")) {
            // The match guarantees that the first '(' directly follows the name
            // and that the last character is the closing ')'.
            int open = field.indexOf('(');
            name = field.substring(0, open);
            regex = field.substring(open + 1, field.length() - 1);
        }

        String value;
        if (name == null || "text".equals(name)) {
            value = e.text();
        } else if ("html".equals(name)) {
            value = e.html();
        } else if ("script".equals(name)) {
            value = e.getElementsByTag("script").html();
        } else {
            value = e.attr(name);
        }

        // Keep only the first capture group of each match; text that the
        // regex does not match is left untouched.
        return regex == null ? value : value.replaceAll(regex, "$1");
    }

}
